
## CoreLogic summaries

# For each vintage (2012, 2016, 2020): read that vintage's rows from
# data/cl_in/, attach the `component` column from data/all_components via `id`,
# collapse to one summary row per component, and write the result to
# data/cl_summ_<year>.
for (this_year in seq(2012, 2020, 4)) {
  print(glue('Working in {this_year}...'))

  # Rows of data/all_components whose id starts with "cl-<two-digit year>".
  cl_clusters <- open_dataset('data/all_components') %>%
    filter(str_starts(id, glue('cl-{str_sub(this_year,3,4)}'))) %>%
    collect()

  # Reduce whatever collect() returned to a plain data.frame, then convert it
  # to a data.table by reference.
  if (!inherits(cl_clusters, 'data.table')) {
    cl_clusters <- as.data.frame(cl_clusters)
  }
  setDT(cl_clusters, key = 'id')

  out <- open_dataset('data/cl_in/') %>%
    filter(vintage == this_year) %>%
    select(
      cl_row_id,
      person,
      uid,
      c(census_id:calc_total_value),
      c(assd_total_value:tax_amt),
      property_type,
      occupancy,
      fips_st,
      fips,
      gender_cl = gender
    ) %>%
    # id is uid with its last two characters removed; occupied is TRUE when
    # occupancy is 'O' (and NA when occupancy is missing).
    mutate(
      id = str_replace_all(uid, '.{2}$', ''),
      occupied = occupancy == 'O'
    ) %>%
    select(-c(uid, occupancy)) %>%
    collect()

  if (!inherits(out, 'data.table')) {
    out <- as.data.frame(out)
  }
  setDT(out, key = 'id')

  # X[Y] join: every row of `out` is kept; rows without a match in
  # cl_clusters get component = NA.
  out <- cl_clusters[out, on = 'id']

  # Unmatched rows become their own component, named "<cl_row_id>-<person>".
  out[, component := fifelse(
    is.na(component),
    str_c(cl_row_id, person, sep = '-'),
    as.character(component)
  )]
  out[, person := NULL]

  # Count each property record at most once per component. (No setkey() is
  # needed first: the key was never used afterwards and the stable setorder()
  # below yields the same row order without the extra sort.)
  out <- unique(out, by = c('component', 'cl_row_id'))

  # Within each component put occupied properties first, then the highest
  # calc_total_value, then the highest tax_amt. setorder() defaults to
  # na.last = FALSE, which would sort rows with a missing occupied / value /
  # tax ahead of everything else and make them the representative row below,
  # so NAs are explicitly sent to the end.
  setorder(
    out, component, -occupied, -calc_total_value, -tax_amt,
    na.last = TRUE
  )

  # Attributes of the first (top-ranked) row of every component.
  out2 <- out[, .SD[1L], by = 'component', .SDcols = c(
    'fips_st',
    'census_id',
    'site_lat',
    'site_long',
    'property_type',
    'fips',
    'gender_cl'
  )]

  setnames(out2, 'fips_st', 'resid_state')

  # Per-component totals across all of its property rows.
  out <- out[, .(
    n_prop = .N,
    n_val = sum(!is.na(calc_total_value)),
    total = sum(calc_total_value, na.rm = TRUE),
    tax = sum(tax_amt, na.rm = TRUE),
    # Like length(unique(.)), a missing fips_st counts as its own value.
    n_state = uniqueN(fips_st)
  ), by = 'component'][, cycle := this_year]

  # Attach the representative-row attributes to the totals.
  out <- out[out2, on = 'component']

  write_dataset(out, glue('data/cl_summ_{this_year}'), max_rows_per_file = 1e6)
}
